1 Introduction

Here, we will transfer the label and UMAP coordinates from the discovery cohort to the validation cohort. We will do it one compartment at a time (e.g. pc), so we can validate that the transfer is robust

Specifically, we will follow these steps for every compartment:

  1. Include or update the annotation label for this object.
  2. Plot the integrated object split by type (quey/reference) to visualize if discovery and validation cohorts were integrated properly.
  3. Transfer label.
  4. Transfer UMAP coordinates + visualize
  5. Exclude cells with a low annotation probability and/or a high doublet score
  6. Visualize heatmap of specific markers to validate that the cluster and annotation holds for the new cohort.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(tidyverse)
library(caret)
library(class)
library(harmony)
library(here)
library(glue)
library(DT)
library(pheatmap2)

2.2 Source functions

source(here("scRNA-seq/bin/SLOcatoR_functions.R"))
source(here("scRNA-seq/bin/utils.R"))
source(here("scRNA-seq/bin/utils_final_clusters.R"))
color_palette <-  c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", "#5A5156", "#AA0DFE",
                    "#F8A19F", "#F7E1A0", "#1C8356", "#FEAF16", "#822E1C", "#C4451C",
                    "#1CBE4F", "#325A9B", "#F6222E", "#FE00FA", "#FBE426", "#16FF32",
                    "black",   "#3283FE", "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
                    "#90AD1C", "#FE00FA", "#85660D", "#3B00FB", "#822E1C", "coral2",
                    "#1CFFCE", "#1CBE4F", "#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
                    "#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD", "#FEAF16", "#683B79",
                    "#B10DA1", "#1C7F93", "#F8A19F", "dark orange", "#FEAF16", "#FBE426",
                    "Brown")

3 PC

# Read data, clean and include annotation
pc <- readRDS(here("scRNA-seq/results/R_objects/seurat_objects_revision/4.9-PC_seurat_object_discovery_validation_cohorts.rds"))
pc <- updateAnnotation(pc)
pc$annotation_20230508 <- pc$annotation_20220619
pc$annotation_20230508[is.na(pc$annotation_20230508)] <- "unannotated"
pc$annotation_20230508 <- factor(
  pc$annotation_20230508,
  levels = c(names(colors_20230508[["PC"]]), "unannotated")
)
Idents(pc) <- "annotation_20230508"
pc$is_reference <- ifelse(
  pc$cohort_type == "discovery" & pc$assay == "3P",
  "reference",
  "query"
)
DimPlot(
  pc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = c(colors_20230508[["PC"]], "unannotated" = "black")
) &
  theme(legend.text = element_text(size = 6))

We stil see some clusters that did not integrate well. Let us inspect the doublet scores and markers of other populations:

FeaturePlot(pc, c("CD3D", "LYZ")) & scale_color_viridis_c(option = "magma")

FeaturePlot(pc[, pc$cohort_type == "validation"], "doublet_score_scDblFinder") &
  scale_color_viridis_c(option = "magma")

FeaturePlot(pc, c("S.Score", "G2M.Score")) & scale_color_viridis_c(option = "magma")

We see a clear cluster of T-cell doublets. Let us exclude them from the dataset:

pc <- FindNeighbors(pc, reduction = "harmony", dims = 1:25)
pc <- FindClusters(pc, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 14913
## Number of edges: 603207
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9163
## Number of communities: 11
## Elapsed time: 3 seconds
DimPlot(pc, cols = color_palette)

markers6 <- FindMarkers(pc, ident.1 = "6", only.pos = TRUE, logfc.threshold = 0.6)
DT::datatable(markers6)
pc <- pc[, !(pc$cohort_type == "validation" & pc$seurat_clusters == "6")]
DimPlot(pc)

In addition, we see that cells from other compartments (GCBC, MBC) were included to map trajectories. For clarity, we will exclude them from the analysis:

exclude <- c("Dark Zone GCBC", "Light Zone GCBC")
pc <- pc[, !(pc$annotation_20230508 %in% exclude)]
pc$annotation_20230508 <- droplevels(pc$annotation_20230508)
colors_20230508$PC <- colors_20230508$PC[!(names(colors_20230508$PC) %in% exclude)]

Finally, let’s explore how doublet scores distribute per cluster, and remove any cluster with a high doublet score:

VlnPlot(pc[, pc$cohort_type == "validation"], features = "doublet_score_scDblFinder", pt.size = 0)

Reintegrate:

pc$type <- str_c(pc$cohort_type, pc$assay, sep = "_")
shared_hvg <- find_assay_specific_features(pc, assay_var = "type")
pc <- integrate_assays(
  pc,
  assay_var = "type",
  shared_hvg = shared_hvg
)
pc <- RunUMAP(pc, dims = 1:25, reduction = "harmony")
DimPlot(
  pc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = c(colors_20230508[["PC"]], "unannotated" = "gray")
)  &
  theme(legend.text = element_text(size = 6))

Transfer label:

# Split training and test sets
Idents(pc) <- "annotation_20230508"
data_sets <- split_training_and_test_sets(
  pc,
  split_var = "cohort_type",
  referece_label = "discovery",
  query_label = "validation",
  reduction = "harmony",
  n_dims = 25
)


# Transfer label
annotation_query_df <- transfer_label(
  seurat_obj = pc,
  training_set = data_sets$training_set,
  test_set = data_sets$test_set,
  k = 8,
  response_var = "annotation_20230508"
)

# Transfer UMAP coords
# umap_test_df <- transfer_umap_coords(
#   seurat_obj = pc,
#   training_set = data_sets$training_set,
#   test_set = data_sets$test_set,
#   umap1_var = "UMAP_1_20220215",
#   umap2_var = "UMAP_2_20220215",
#   k = 15
# )


# Include annotation and UMAP coords in Seurat object
pc$annotation_20230508[annotation_query_df$query_cells] <- annotation_query_df$annotation
Idents(pc) <- "annotation_20230508"
pc$annotation_20230508_probability <- NA
pc$annotation_20230508_probability[annotation_query_df$query_cells] <- annotation_query_df$annotation_prob
# pc$UMAP_1_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP1
# pc$UMAP_2_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP2
p1 <- FeaturePlot(
  pc[, annotation_query_df$query_cells],
  "annotation_20230508_probability"
) + scale_color_viridis_c(option = "magma")
p2 <- FeaturePlot(
  pc[, annotation_query_df$query_cells],
  "doublet_score_scDblFinder"
) + scale_color_viridis_c(option = "magma")
p1 | p2

DimPlot(
  pc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = colors_20230508[["PC"]]
)  &
  theme(legend.text = element_text(size = 6))

table(pc$annotation_20230508_probability)
## 
## 0.222222222222222              0.25 0.333333333333333             0.375 0.444444444444444               0.5             0.625 0.666666666666667              0.75 0.777777777777778             0.875 0.888888888888889                 1 
##                 1                69                 1               426                 3               824               900                 7               803                 2               852                 5              1288
FeaturePlot(
  pc[, pc$cohort_type == "validation"],
  c("annotation_20230508_probability", "doublet_score_scDblFinder")
) & scale_color_viridis_c(option = "magma")

There is a cluster with a high doublet score and a low annotation probability. We will flag it for later discussion.

Plot heatmap:

goi <- c("SUGCT", "AICDA", "CXCR4", "LMO2", "CD83", "BCL2A1", "BCL6", "IRF8",
         "MEF2B", "MS4A1", "PAX5", "PRDM1", "XBP1", "IRF4", "SLAMF7", "SSR4",
         "MZB1", "DERL3", "CREB3L2", "FKBP11", "IGHG1", "IGHG2", "IGHG3", "IGHG4",
         "IGHA1", "IGHA2", "IGHM", "IGHD", "CCDC50", "FCMR", "CD9", "CD44", "H2AFZ",
         "MKI67", "TUBA1B", "TOP2A")
avgexpr_mat <- AverageExpression(
  pc[, pc$cohort_type == "validation"],
  features = goi,
  assays = "RNA",
  return.seurat = FALSE,
  group.by = "annotation_20230508",
  slot = "data"
)$RNA


# Define annotation column
cell_types <- levels(pc$annotation_20230508)
mycolors <- list(cell_type = c(colors_20230508$PC))
annotation_col <- data.frame(cell_type = cell_types) 
rownames(annotation_col) <- cell_types


# Scale per row between 0 and 1 
input_mat <- t(apply(avgexpr_mat, 1, function(x) (x - min(x)) / diff(range(x))))


# Plot heatmap
pheatmap2(
  input_mat,
  annotation_col = annotation_col,
  annotation_colors = mycolors,
  annotation_names_col = TRUE,
  annotation_legend = TRUE,
  show_rownames = TRUE,
  show_colnames = FALSE, 
  border_color = NA,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  fontsize_row = 6
)

Plot annotation probability:

pc@meta.data %>%
  filter(cohort_type == "validation") %>%
  ggplot(aes(annotation_20230508, annotation_20230508_probability, fill = annotation_20230508)) +
    geom_boxplot() +
    scale_fill_manual(values = colors_20230508$PC) +
    theme_classic() +
    theme(legend.position = "none") +
    coord_flip()

4 Save

saveRDS(pc, here("scRNA-seq/results/R_objects/seurat_objects_revision/5.9-pc_seurat_object_discovery_validation_cohorts_annotation.rds"))

5 Session Information

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=es_ES.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] pheatmap2_0.1.0    fastcluster_1.2.3  parallelDist_0.2.6 gtable_0.3.1       scales_1.2.1       RColorBrewer_1.1-3 DT_0.27            glue_1.6.2         here_1.0.1         harmony_0.1.1      Rcpp_1.0.10        class_7.3-21       caret_6.0-93       lattice_0.20-45    forcats_1.0.0      stringr_1.5.0      dplyr_1.1.0        purrr_1.0.1        readr_2.1.3        tidyr_1.3.0        tibble_3.1.8       ggplot2_3.4.0      tidyverse_1.3.2    SeuratObject_4.1.3 Seurat_4.3.0       BiocStyle_2.26.0  
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3             spatstat.explore_3.0-6 reticulate_1.28        tidyselect_1.2.0       htmlwidgets_1.6.1      Rtsne_0.16             pROC_1.18.0            munsell_0.5.0          codetools_0.2-19       ica_1.0-3              future_1.31.0          miniUI_0.1.1.1         withr_2.5.0            spatstat.random_3.1-3  colorspace_2.1-0       progressr_0.13.0       highr_0.10             knitr_1.42             rstudioapi_0.14        stats4_4.2.0           ROCR_1.0-11            tensor_1.5             listenv_0.9.0          labeling_0.4.2         polyclip_1.10-4        farver_2.1.1           rprojroot_2.0.3        parallelly_1.34.0      vctrs_0.5.2            generics_0.1.3         ipred_0.9-13           xfun_0.37              timechange_0.2.0       R6_2.5.1               ggbeeswarm_0.7.1       spatstat.utils_3.0-1   cachem_1.0.6           promises_1.2.0.1       nnet_7.3-18            googlesheets4_1.0.1    beeswarm_0.4.0         globals_0.16.2         goftest_1.2-3          timeDate_4022.108      rlang_1.0.6            splines_4.2.0          lazyeval_0.2.2         ModelMetrics_1.2.2.2   gargle_1.3.0           spatstat.geom_3.0-6    broom_1.0.3           
##  [52] BiocManager_1.30.19    yaml_2.3.7             reshape2_1.4.4         abind_1.4-5            modelr_0.1.10          crosstalk_1.2.0        backports_1.4.1        httpuv_1.6.8           tools_4.2.0            lava_1.7.1             bookdown_0.32          ellipsis_0.3.2         jquerylib_0.1.4        ggridges_0.5.4         plyr_1.8.8             rpart_4.1.19           deldir_1.0-6           pbapply_1.7-0          cowplot_1.1.1          zoo_1.8-11             haven_2.5.1            ggrepel_0.9.3          cluster_2.1.4          fs_1.6.0               magrittr_2.0.3         data.table_1.14.6      scattermore_0.8        lmtest_0.9-40          reprex_2.0.2           RANN_2.6.1             googledrive_2.0.0      fitdistrplus_1.1-8     matrixStats_0.63.0     hms_1.1.2              patchwork_1.1.2        mime_0.12              evaluate_0.20          xtable_1.8-4           readxl_1.4.1           gridExtra_2.3          compiler_4.2.0         KernSmooth_2.23-20     crayon_1.5.2           htmltools_0.5.4        later_1.3.0            tzdb_0.3.0             RcppParallel_5.1.6     lubridate_1.9.1        DBI_1.1.3              dbplyr_2.3.2           MASS_7.3-58.2         
## [103] Matrix_1.5-3           cli_3.6.0              parallel_4.2.0         gower_1.0.1            igraph_1.3.5           pkgconfig_2.0.3        sp_1.6-0               plotly_4.10.1          spatstat.sparse_3.0-0  recipes_1.0.4          xml2_1.3.3             foreach_1.5.2          vipor_0.4.5            bslib_0.4.2            hardhat_1.2.0          prodlim_2019.11.13     rvest_1.0.3            digest_0.6.31          sctransform_0.3.5      RcppAnnoy_0.0.20       spatstat.data_3.0-0    rmarkdown_2.20         cellranger_1.1.0       leiden_0.4.3           uwot_0.1.14            shiny_1.7.4            lifecycle_1.0.3        nlme_3.1-162           jsonlite_1.8.4         limma_3.54.1           viridisLite_0.4.1      fansi_1.0.4            pillar_1.8.1           ggrastr_1.0.1          fastmap_1.1.0          httr_1.4.4             survival_3.5-0         png_0.1-8              iterators_1.0.14       stringi_1.7.12         sass_0.4.5             irlba_2.3.5.1          future.apply_1.10.0